from transformers import LlamaTokenizer, LlamaForCausalLM

# Load the tokenizer and model, move the model to the GPU, and switch to inference mode.
tokenizer = LlamaTokenizer.from_pretrained("yulan-team/YuLan-Chat-2-13b-fp16")
model = LlamaForCausalLM.from_pretrained("yulan-team/YuLan-Chat-2-13b-fp16").cuda()
model = model.eval()
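
# Note (an assumption, not part of the original recipe): the checkpoint is stored
# in fp16, but from_pretrained loads weights as fp32 unless a dtype is given, so
# the model above takes roughly twice the checkpoint size in GPU memory. Passing
# torch_dtype=torch.float16 keeps it in half precision:
#   import torch
#   model = LlamaForCausalLM.from_pretrained(
#       "yulan-team/YuLan-Chat-2-13b-fp16", torch_dtype=torch.float16
#   ).cuda()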

# Wrap the user message in YuLan's conversation template; the model completes the
# text after the "[|AI|]:" marker. The template string is kept verbatim, since the
# model was tuned on this exact format.
input_text = "hello"
prompt = "The following is a conversation between a human and an AI assistant namely YuLan, developed by GSAI, Renmin University of China. The AI assistant gives helpful, detailed, and polite answers to the user's questions.\n[|Human|]:{}\n[|AI|]:".format(input_text)

# Tokenize, truncating to the model's 8192-token context window.
inputs = tokenizer(prompt, return_tensors="pt", padding="longest", max_length=8192, truncation=True, return_attention_mask=True, add_special_tokens=True)

# Sampling hyperparameters. max_length covers the prompt plus the generation; the
# LLaMA tokenizer defines no pad token, so the BOS id is reused for padding.
kwargs = {"temperature": 0.8, "top_p": 0.95, "top_k": 50, "repetition_penalty": 1.1, "no_repeat_ngram_size": 64, "max_length": 8192, "pad_token_id": tokenizer.bos_token_id, "eos_token_id": tokenizer.eos_token_id}

# Move the inputs to the model's device and sample a completion.
outputs = model.generate(inputs["input_ids"].to(model.device), attention_mask=inputs["attention_mask"].to(model.device), do_sample=True, **kwargs)
# batch_decode returns a list of strings, so index the first sequence before
# stripping the prompt prefix; slicing the list itself would print an empty list.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(prompt):])
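
# A minimal multi-turn sketch (assumptions: follow-up turns reuse the same
# [|Human|]/[|AI|] markers as the template above, and the follow-up question is a
# made-up example). Append the model's reply and the next user message to the
# running prompt, then generate again with the same settings:
reply = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(prompt):]
prompt = prompt + reply + "\n[|Human|]:{}\n[|AI|]:".format("What can you help me with?")
inputs = tokenizer(prompt, return_tensors="pt", padding="longest", max_length=8192, truncation=True, return_attention_mask=True, add_special_tokens=True)
outputs = model.generate(inputs["input_ids"].to(model.device), attention_mask=inputs["attention_mask"].to(model.device), do_sample=True, **kwargs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(prompt):])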